From 192bbdb484e6cc690ed8c0ea2eba218dcc60f9f4 Mon Sep 17 00:00:00 2001 From: Jan Heller Date: Fri, 23 May 2008 13:36:35 +0000 Subject: [PATCH] Added support for CPU acceleration, RGBA float -> RGB u8 and RGBA float -> 2008-05-23 Jan Heller Added support for CPU acceleration, RGBA float -> RGB u8 and RGBA float -> RGBA u8 SSE accelerated conversions. * babl/Makefile.am: Added babl-cpuaccel.[ch]. * babl/babl-cpuaccel.h: * babl/babl-cpuaccel.c (babl_cpu_accel_get_support), (babl_cpu_accel_set_use), (arch_get_vendor), (arch_accel_intel), (arch_accel_amd), (arch_accel_centaur), (arch_accel_cyrix), (sigill_handler), (arch_accel_sse_os_support), (arch_accel), (cpu_accel): Runtime CPU detection code. Ported from GIMP. * babl/babl-internal.h: Include babl-cpuaccel.h. * babl/babl-memory.c (babl_malloc): Make babl_malloc align memory to BABL_ALIGN==16 boundaries. * babl/babl.c (babl_init): Enabled CPU acceleration. * configure.ac: Added compile time MMX/SSE/AltiVec detection. Ported from GIMP. * extensions/Makefile.am: Added SSE_EXTRA_CFLAGS for sse-fixups.c compilation. * extensions/sse-fixups.c (conv_rgbaF_linear_rgb8_linear), (conv_rgbaF_linear_rgba8_linear), (init): Added RGBA float -> RGB u8 and RGBA float -> RGBA u8 SSE accelerated conversions. 
svn path=/trunk/; revision=316 --- ChangeLog | 24 ++ babl/Makefile.am | 6 +- babl/babl-cpuaccel.c | 497 ++++++++++++++++++++++++++++++++++++++++ babl/babl-cpuaccel.h | 43 ++++ babl/babl-internal.h | 1 + babl/babl-memory.c | 19 +- babl/babl.c | 2 + configure.ac | 119 ++++++++++ extensions/Makefile.am | 3 + extensions/sse-fixups.c | 159 +++++++++++++ 10 files changed, 864 insertions(+), 9 deletions(-) create mode 100644 babl/babl-cpuaccel.c create mode 100644 babl/babl-cpuaccel.h create mode 100644 extensions/sse-fixups.c diff --git a/ChangeLog b/ChangeLog index f5955a6..1b9b9f2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +2008-05-23 Jan Heller + + Added support for CPU acceleration, RGBA float -> RGB u8 and + RGBA float -> RGBA u8 SSE accelerated conversions. + + * babl/Makefile.am: Added babl-cpuaccel.[ch]. + * babl/babl-cpuaccel.h: + * babl/babl-cpuaccel.c (babl_cpu_accel_get_support), + (babl_cpu_accel_set_use), (arch_get_vendor), (arch_accel_intel), + (arch_accel_amd), (arch_accel_centaur), (arch_accel_cyrix), + (sigill_handler), (arch_accel_sse_os_support), (arch_accel), + (cpu_accel): Runtime CPU detection code. Ported from GIMP. + * babl/babl-internal.h: Include babl-cpuaccel.h. + * babl/babl-memory.c (babl_malloc): Make babl_malloc align memory to + BABL_ALIGN==16 boundaries. + * babl/babl.c (babl_init): Enabled CPU acceleration. + * configure.ac: Added compile time MMX/SSE/AltiVec detection. Ported + from GIMP. + * extensions/Makefile.am: Added SSE_EXTRA_CFLAGS for sse-fixups.c + compilation. + * extensions/sse-fixups.c (conv_rgbaF_linear_rgb8_linear), + (conv_rgbaF_linear_rgba8_linear), (init): Added RGBA float -> RGB u8 + and RGBA float -> RGBA u8 SSE accelerated conversions. 
+ 2008-05-22 Jan Heller * extensions/util.h: changed fallback macro to have the diff --git a/babl/Makefile.am b/babl/Makefile.am index 36bcdd0..771fb2c 100644 --- a/babl/Makefile.am +++ b/babl/Makefile.am @@ -29,7 +29,8 @@ c_sources = \ babl-type.c \ babl-util.c \ babl-list.c \ - babl-hash-table.c + babl-hash-table.c \ + babl-cpuaccel.c h_sources = \ babl-db.h \ @@ -39,7 +40,8 @@ h_sources = \ babl-util.h \ babl.h \ babl-list.h \ - babl-hash-table.h + babl-hash-table.h \ + babl-cpuaccel.h library_includedir=$(includedir)/babl-$(BABL_API_VERSION)/babl library_include_HEADERS = \ diff --git a/babl/babl-cpuaccel.c b/babl/babl-cpuaccel.c new file mode 100644 index 0000000..2f45cfd --- /dev/null +++ b/babl/babl-cpuaccel.c @@ -0,0 +1,497 @@ +/* babl - dynamically extendable universal pixel conversion library. + * Copyright (C) 2005-2008, Øyvind Kolås and others. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General + * Public License along with this library; if not, see + * . 
+ */ + +/* + * x86 bits Copyright (C) Manish Singh + */ + +/* + * PPC CPU acceleration detection was taken from DirectFB but seems to be + * originating from mpeg2dec with the following copyright: + * + * Copyright (C) 1999-2001 Aaron Holtzman + */ + +#include "config.h" + +#include <string.h> +#include <signal.h> +#include <setjmp.h> + +#include "babl-cpuaccel.h" + +typedef unsigned int gboolean; +typedef unsigned int guint32; +typedef int gint; +typedef char gchar; +#define TRUE 1 +#define FALSE 0 +#define G_GNUC_CONST + +static BablCpuAccelFlags cpu_accel (void) G_GNUC_CONST; + +static gboolean use_cpu_accel = TRUE; + + +/** + * babl_cpu_accel_get_support: + * + * Query for CPU acceleration support. + * + * Return value: #BablCpuAccelFlags as supported by the CPU. + */ +BablCpuAccelFlags +babl_cpu_accel_get_support (void) +{ + return use_cpu_accel ? cpu_accel () : BABL_CPU_ACCEL_NONE; +} + +/** + * babl_cpu_accel_set_use: + * @use: whether to use CPU acceleration features or not + * + * This function is for internal use only. + */ +void +babl_cpu_accel_set_use (gboolean use) +{ + use_cpu_accel = use ? 
TRUE : FALSE; +} + + +#if defined(ARCH_X86) && defined(USE_MMX) && defined(__GNUC__) + +#define HAVE_ACCEL 1 + + +typedef enum +{ + ARCH_X86_VENDOR_NONE, + ARCH_X86_VENDOR_INTEL, + ARCH_X86_VENDOR_AMD, + ARCH_X86_VENDOR_CENTAUR, + ARCH_X86_VENDOR_CYRIX, + ARCH_X86_VENDOR_NSC, + ARCH_X86_VENDOR_TRANSMETA, + ARCH_X86_VENDOR_NEXGEN, + ARCH_X86_VENDOR_RISE, + ARCH_X86_VENDOR_UMC, + ARCH_X86_VENDOR_SIS, + ARCH_X86_VENDOR_UNKNOWN = 0xff +} X86Vendor; + +enum +{ + ARCH_X86_INTEL_FEATURE_MMX = 1 << 23, + ARCH_X86_INTEL_FEATURE_XMM = 1 << 25, + ARCH_X86_INTEL_FEATURE_XMM2 = 1 << 26, + + ARCH_X86_AMD_FEATURE_MMXEXT = 1 << 22, + ARCH_X86_AMD_FEATURE_3DNOW = 1 << 31, + + ARCH_X86_CENTAUR_FEATURE_MMX = 1 << 23, + ARCH_X86_CENTAUR_FEATURE_MMXEXT = 1 << 24, + ARCH_X86_CENTAUR_FEATURE_3DNOW = 1 << 31, + + ARCH_X86_CYRIX_FEATURE_MMX = 1 << 23, + ARCH_X86_CYRIX_FEATURE_MMXEXT = 1 << 24 +}; + +enum +{ + ARCH_X86_INTEL_FEATURE_PNI = 1 << 0 +}; + +#if !defined(ARCH_X86_64) && (defined(PIC) || defined(__PIC__)) +#define cpuid(op,eax,ebx,ecx,edx) \ + __asm__ ("movl %%ebx, %%esi\n\t" \ + "cpuid\n\t" \ + "xchgl %%ebx,%%esi" \ + : "=a" (eax), \ + "=S" (ebx), \ + "=c" (ecx), \ + "=d" (edx) \ + : "0" (op)) +#else +#define cpuid(op,eax,ebx,ecx,edx) \ + __asm__ ("cpuid" \ + : "=a" (eax), \ + "=b" (ebx), \ + "=c" (ecx), \ + "=d" (edx) \ + : "0" (op)) +#endif + + +static X86Vendor +arch_get_vendor (void) +{ + guint32 eax, ebx, ecx, edx; + gchar id[16]; + +#ifndef ARCH_X86_64 + /* Only need to check this on ia32 */ + __asm__ ("pushfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl $0x200000,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "popfl" + : "=a" (eax), + "=c" (ecx) + : + : "cc"); + + if (eax == ecx) + return ARCH_X86_VENDOR_NONE; +#endif + + cpuid (0, eax, ebx, ecx, edx); + + if (eax == 0) + return ARCH_X86_VENDOR_NONE; + + *(int *)&id[0] = ebx; + *(int *)&id[4] = edx; + *(int *)&id[8] = ecx; + + id[12] = '\0'; + +#ifdef ARCH_X86_64 + if (strcmp (id, 
"AuthenticAMD") == 0) + return ARCH_X86_VENDOR_AMD; + else if (strcmp (id, "GenuineIntel") == 0) + return ARCH_X86_VENDOR_INTEL; +#else + if (strcmp (id, "GenuineIntel") == 0) + return ARCH_X86_VENDOR_INTEL; + else if (strcmp (id, "AuthenticAMD") == 0) + return ARCH_X86_VENDOR_AMD; + else if (strcmp (id, "CentaurHauls") == 0) + return ARCH_X86_VENDOR_CENTAUR; + else if (strcmp (id, "CyrixInstead") == 0) + return ARCH_X86_VENDOR_CYRIX; + else if (strcmp (id, "Geode by NSC") == 0) + return ARCH_X86_VENDOR_NSC; + else if (strcmp (id, "GenuineTMx86") == 0 || + strcmp (id, "TransmetaCPU") == 0) + return ARCH_X86_VENDOR_TRANSMETA; + else if (strcmp (id, "NexGenDriven") == 0) + return ARCH_X86_VENDOR_NEXGEN; + else if (strcmp (id, "RiseRiseRise") == 0) + return ARCH_X86_VENDOR_RISE; + else if (strcmp (id, "UMC UMC UMC ") == 0) + return ARCH_X86_VENDOR_UMC; + else if (strcmp (id, "SiS SiS SiS ") == 0) + return ARCH_X86_VENDOR_SIS; +#endif + + return ARCH_X86_VENDOR_UNKNOWN; +} + +static guint32 +arch_accel_intel (void) +{ + guint32 caps = 0; + +#ifdef USE_MMX + { + guint32 eax, ebx, ecx, edx; + + cpuid (1, eax, ebx, ecx, edx); + + if ((edx & ARCH_X86_INTEL_FEATURE_MMX) == 0) + return 0; + + caps = BABL_CPU_ACCEL_X86_MMX; + +#ifdef USE_SSE + if (edx & ARCH_X86_INTEL_FEATURE_XMM) + caps |= BABL_CPU_ACCEL_X86_SSE | BABL_CPU_ACCEL_X86_MMXEXT; + + if (edx & ARCH_X86_INTEL_FEATURE_XMM2) + caps |= BABL_CPU_ACCEL_X86_SSE2; + + if (ecx & ARCH_X86_INTEL_FEATURE_PNI) + caps |= BABL_CPU_ACCEL_X86_SSE3; +#endif /* USE_SSE */ + } +#endif /* USE_MMX */ + + return caps; +} + +static guint32 +arch_accel_amd (void) +{ + guint32 caps; + + caps = arch_accel_intel (); + +#ifdef USE_MMX + { + guint32 eax, ebx, ecx, edx; + + cpuid (0x80000000, eax, ebx, ecx, edx); + + if (eax < 0x80000001) + return caps; + +#ifdef USE_SSE + cpuid (0x80000001, eax, ebx, ecx, edx); + + if (edx & ARCH_X86_AMD_FEATURE_3DNOW) + caps |= BABL_CPU_ACCEL_X86_3DNOW; + + if (edx & ARCH_X86_AMD_FEATURE_MMXEXT) + caps |= 
BABL_CPU_ACCEL_X86_MMXEXT; +#endif /* USE_SSE */ + } +#endif /* USE_MMX */ + + return caps; +} + +static guint32 +arch_accel_centaur (void) +{ + guint32 caps; + + caps = arch_accel_intel (); + +#ifdef USE_MMX + { + guint32 eax, ebx, ecx, edx; + + cpuid (0x80000000, eax, ebx, ecx, edx); + + if (eax < 0x80000001) + return caps; + + cpuid (0x80000001, eax, ebx, ecx, edx); + + if (edx & ARCH_X86_CENTAUR_FEATURE_MMX) + caps |= BABL_CPU_ACCEL_X86_MMX; + +#ifdef USE_SSE + if (edx & ARCH_X86_CENTAUR_FEATURE_3DNOW) + caps |= BABL_CPU_ACCEL_X86_3DNOW; + + if (edx & ARCH_X86_CENTAUR_FEATURE_MMXEXT) + caps |= BABL_CPU_ACCEL_X86_MMXEXT; +#endif /* USE_SSE */ + } +#endif /* USE_MMX */ + + return caps; +} + +static guint32 +arch_accel_cyrix (void) +{ + guint32 caps; + + caps = arch_accel_intel (); + +#ifdef USE_MMX + { + guint32 eax, ebx, ecx, edx; + + cpuid (0, eax, ebx, ecx, edx); + + if (eax != 2) + return caps; + + cpuid (0x80000001, eax, ebx, ecx, edx); + + if (edx & ARCH_X86_CYRIX_FEATURE_MMX) + caps |= BABL_CPU_ACCEL_X86_MMX; + +#ifdef USE_SSE + if (edx & ARCH_X86_CYRIX_FEATURE_MMXEXT) + caps |= BABL_CPU_ACCEL_X86_MMXEXT; +#endif /* USE_SSE */ + } +#endif /* USE_MMX */ + + return caps; +} + +#ifdef USE_SSE +static jmp_buf sigill_return; + +static void +sigill_handler (gint n) +{ + longjmp (sigill_return, 1); +} + +static gboolean +arch_accel_sse_os_support (void) +{ + if (setjmp (sigill_return)) + { + return FALSE; + } + else + { + signal (SIGILL, sigill_handler); + __asm__ __volatile__ ("xorps %xmm0, %xmm0"); + signal (SIGILL, SIG_DFL); + } + + return TRUE; +} +#endif /* USE_SSE */ + +static guint32 +arch_accel (void) +{ + guint32 caps; + X86Vendor vendor; + + vendor = arch_get_vendor (); + + switch (vendor) + { + case ARCH_X86_VENDOR_NONE: + caps = 0; + break; + + case ARCH_X86_VENDOR_AMD: + caps = arch_accel_amd (); + break; + + case ARCH_X86_VENDOR_CENTAUR: + caps = arch_accel_centaur (); + break; + + case ARCH_X86_VENDOR_CYRIX: + case ARCH_X86_VENDOR_NSC: + caps = 
arch_accel_cyrix (); + break; + + /* check for what Intel speced, even if UNKNOWN */ + default: + caps = arch_accel_intel (); + break; + } + +#ifdef USE_SSE + if ((caps & BABL_CPU_ACCEL_X86_SSE) && !arch_accel_sse_os_support ()) + caps &= ~(BABL_CPU_ACCEL_X86_SSE | BABL_CPU_ACCEL_X86_SSE2); +#endif + + return caps; +} + +#endif /* ARCH_X86 && USE_MMX && __GNUC__ */ + + +#if defined(ARCH_PPC) && defined (USE_ALTIVEC) + +#if defined(HAVE_ALTIVEC_SYSCTL) + +#include <sys/sysctl.h> + +#define HAVE_ACCEL 1 + +static guint32 +arch_accel (void) +{ + gint sels[2] = { CTL_HW, HW_VECTORUNIT }; + gboolean has_vu = FALSE; + gsize length = sizeof(has_vu); + gint err; + + err = sysctl (sels, 2, &has_vu, &length, NULL, 0); + + if (err == 0 && has_vu) + return BABL_CPU_ACCEL_PPC_ALTIVEC; + + return 0; +} + +#elif defined(__GNUC__) + +#define HAVE_ACCEL 1 + +static sigjmp_buf jmpbuf; +static volatile sig_atomic_t canjump = 0; + +static void +sigill_handler (gint sig) +{ + if (!canjump) + { + signal (sig, SIG_DFL); + raise (sig); + } + + canjump = 0; + siglongjmp (jmpbuf, 1); +} + +static guint32 +arch_accel (void) +{ + signal (SIGILL, sigill_handler); + + if (sigsetjmp (jmpbuf, 1)) + { + signal (SIGILL, SIG_DFL); + return 0; + } + + canjump = 1; + + asm volatile ("mtspr 256, %0\n\t" + "vand %%v0, %%v0, %%v0" + : + : "r" (-1)); + + signal (SIGILL, SIG_DFL); + + return BABL_CPU_ACCEL_PPC_ALTIVEC; +} +#endif /* __GNUC__ */ + +#endif /* ARCH_PPC && USE_ALTIVEC */ + + +static BablCpuAccelFlags +cpu_accel (void) +{ +#ifdef HAVE_ACCEL + static guint32 accel = ~0U; + + if (accel != ~0U) + return accel; + + accel = arch_accel (); + + return (BablCpuAccelFlags) accel; + +#else /* !HAVE_ACCEL */ + return BABL_CPU_ACCEL_NONE; +#endif +} diff --git a/babl/babl-cpuaccel.h b/babl/babl-cpuaccel.h new file mode 100644 index 0000000..0fd4c4b --- /dev/null +++ b/babl/babl-cpuaccel.h @@ -0,0 +1,43 @@ +/* babl - dynamically extendable universal pixel conversion library. 
+ * Copyright (C) 2005-2008, Øyvind Kolås and others. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General + * Public License along with this library; if not, see + * . + */ + +#ifndef _BABL_CPU_ACCEL_H +#define _BABL_CPU_ACCEL_H + +typedef enum +{ + BABL_CPU_ACCEL_NONE = 0x0, + + /* x86 accelerations */ + BABL_CPU_ACCEL_X86_MMX = 0x80000000, + BABL_CPU_ACCEL_X86_3DNOW = 0x40000000, + BABL_CPU_ACCEL_X86_MMXEXT = 0x20000000, + BABL_CPU_ACCEL_X86_SSE = 0x10000000, + BABL_CPU_ACCEL_X86_SSE2 = 0x08000000, + BABL_CPU_ACCEL_X86_SSE3 = 0x02000000, + + /* powerpc accelerations */ + BABL_CPU_ACCEL_PPC_ALTIVEC = 0x04000000 +} BablCpuAccelFlags; + + +BablCpuAccelFlags babl_cpu_accel_get_support (void); +void babl_cpu_accel_set_use (unsigned int use); + + +#endif /* _BABL_CPU_ACCEL_H */ diff --git a/babl/babl-internal.h b/babl/babl-internal.h index 7ceead3..d2e6f82 100644 --- a/babl/babl-internal.h +++ b/babl/babl-internal.h @@ -45,6 +45,7 @@ #include "babl-ids.h" #include "babl-util.h" #include "babl-memory.h" +#include "babl-cpuaccel.h" /* redefining some functions for the win32 platform */ #ifdef _WIN32 diff --git a/babl/babl-memory.c b/babl/babl-memory.c index 20f5a62..adaece4 100644 --- a/babl/babl-memory.c +++ b/babl/babl-memory.c @@ -47,9 +47,9 @@ typedef struct size_t size; } BablAllocInfo; -#define OFFSET (sizeof (BablAllocInfo)) - -#define BAI(ptr) ((BablAllocInfo *) (((char *) ptr) - OFFSET)) +#define BABL_ALIGN 16 +#define BABL_ALLOC 
(sizeof (BablAllocInfo) + sizeof (void *)) +#define BAI(ptr) ((BablAllocInfo *) *((void **) ptr - 1)) #define IS_BAI(ptr) (BAI (ptr)->signature == signature) /* runtime statistics: */ @@ -96,18 +96,23 @@ void * babl_malloc (size_t size) { char *ret; + int offset; babl_assert (size); functions_sanity (); - ret = malloc_f (size + OFFSET); + ret = malloc_f (BABL_ALLOC + BABL_ALIGN + size); if (!ret) babl_fatal ("args=(%i): failed", size); - BAI (ret + OFFSET)->signature = signature; - BAI (ret + OFFSET)->size = size; + offset = BABL_ALIGN - ((unsigned int) ret + BABL_ALLOC) % BABL_ALIGN; + ret = ret + BABL_ALLOC + offset; + + *((void **) ret - 1) = ret - BABL_ALLOC - offset; + BAI (ret)->signature = signature; + BAI (ret)->size = size; mallocs++; - return (void *) (ret + OFFSET); + return (void *) (ret); } /* Create a duplicate allocation of the same size, note diff --git a/babl/babl.c b/babl/babl.c index 410a557..97ec6fc 100644 --- a/babl/babl.c +++ b/babl/babl.c @@ -23,6 +23,8 @@ static int ref_count = 0; void babl_init (void) { + babl_cpu_accel_set_use (1); + if (ref_count++ == 0) { babl_internal_init (); diff --git a/configure.ac b/configure.ac index 7384c8f..b9c4671 100644 --- a/configure.ac +++ b/configure.ac @@ -244,6 +244,125 @@ AC_SUBST(MATH_LIB) AM_CONDITIONAL(OS_WIN32, test "$os_win32" = "yes") AM_CONDITIONAL(OS_UNIX, test "$os_win32" != "yes") + +dnl =========================================================================== + + +######################## +# Check for MMX assembly +######################## + +AC_ARG_ENABLE(mmx, + [ --enable-mmx enable MMX support (default=auto)],, + enable_mmx=$have_x86) + +AC_ARG_ENABLE(sse, + [ --enable-sse enable SSE support (default=auto)],, + enable_sse=$enable_mmx) + +if test "x$enable_mmx" = xyes; then + BABL_DETECT_CFLAGS(MMX_EXTRA_CFLAGS, '-mmmx') + SSE_EXTRA_CFLAGS= + + AC_MSG_CHECKING(whether we can compile MMX code) + + mmx_save_CFLAGS="$CFLAGS" + CFLAGS="$mmx_save_CFLAGS $MMX_EXTRA_CFLAGS" + + 
AC_COMPILE_IFELSE([asm ("movq 0, %mm0");], + + AC_DEFINE(USE_MMX, 1, [Define to 1 if MMX assembly is available.]) + AC_MSG_RESULT(yes) + + if test "x$enable_sse" = xyes; then + BABL_DETECT_CFLAGS(sse_flag, '-msse') + SSE_EXTRA_CFLAGS="$MMX_EXTRA_CFLAGS $sse_flag" + + AC_MSG_CHECKING(whether we can compile SSE code) + + CFLAGS="$CFLAGS $sse_flag" + + AC_COMPILE_IFELSE([asm ("movntps %xmm0, 0");], + AC_DEFINE(USE_SSE, 1, [Define to 1 if SSE assembly is available.]) + AC_MSG_RESULT(yes) + , + enable_sse=no + AC_MSG_RESULT(no) + AC_MSG_WARN([The assembler does not support the SSE command set.]) + ) + + fi + , + enable_mmx=no + AC_MSG_RESULT(no) + AC_MSG_WARN([The assembler does not support the MMX command set.]) + ) + + CFLAGS="$mmx_save_CFLAGS" + + AC_SUBST(MMX_EXTRA_CFLAGS) + AC_SUBST(SSE_EXTRA_CFLAGS) +fi + + +############################ +# Check for AltiVec assembly +############################ + +AC_ARG_ENABLE(altivec, + [ --enable-altivec enable AltiVec support (default=auto)],, + enable_altivec=$have_ppc) + +if test "x$enable_altivec" = xyes; then + + BABL_DETECT_CFLAGS(altivec_flag, '-faltivec' '-maltivec -mabi=altivec') + + ALTIVEC_EXTRA_CFLAGS= + case "$altivec_flag" in + -maltivec*) + altivec_save_CPPFLAGS="$CPPFLAGS" + CPPFLAGS="$altivec_save_CPPFLAGS $altivec_flag" + AC_CHECK_HEADERS(altivec.h, [ALTIVEC_EXTRA_CFLAGS="$altivec_flag"]) + CPPFLAGS="$altivec_save_CPPFLAGS" + ;; + *) + ALTIVEC_EXTRA_CFLAGS="$altivec_flag" + ;; + esac + AC_SUBST(ALTIVEC_EXTRA_CFLAGS) + + AC_MSG_CHECKING(whether we can compile AltiVec code) + + can_use_altivec=no + if test -z "$ALTIVEC_EXTRA_CFLAGS"; then + AC_MSG_RESULT(no) + AC_MSG_WARN([The compiler does not support the AltiVec command set.]) + else + case "$target_or_host" in + *-*-darwin*) + can_use_altivec=yes + AC_DEFINE(HAVE_ALTIVEC_SYSCTL, 1, + [Define to 1 if the altivec runtime test should use a sysctl.]) + ;; + *) + AC_COMPILE_IFELSE([asm ("vand %v0, %v0, %v0");], + can_use_altivec=yes, can_use_altivec=no) + ;; + 
esac + AC_MSG_RESULT($can_use_altivec) + + if test "x$can_use_altivec" = "xyes"; then + AC_DEFINE(USE_ALTIVEC, 1, [Define to 1 if AltiVec support is available.]) + else + enable_altivec=no + AC_MSG_WARN([The assembler does not support the AltiVec command set.]) + fi + fi + + enable_altivec="$can_use_altivec" +fi + + dnl =========================================================================== AC_SEARCH_LIBS([dlopen], [dl]) diff --git a/extensions/Makefile.am b/extensions/Makefile.am index 59668e2..2510a93 100644 --- a/extensions/Makefile.am +++ b/extensions/Makefile.am @@ -38,6 +38,9 @@ endif CIE-Lab$(SHREXT): CIE-Lab.c $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< $(LDADD) + +sse-fixups$(SHREXT): sse-fixups.c + $(CC) $(CFLAGS) $(MMX_EXTRA_CFLAGS) $(SSE_EXTRA_CFLAGS) $(LDFLAGS) -o $@ $< $(LDADD) #lcms$(SHREXT): lcms.c # $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< $(LDADD) `pkg-config lcms --cflags --libs` ############################################################################# diff --git a/extensions/sse-fixups.c b/extensions/sse-fixups.c new file mode 100644 index 0000000..5569a62 --- /dev/null +++ b/extensions/sse-fixups.c @@ -0,0 +1,159 @@ +/* babl - dynamically extendable universal pixel conversion library. + * Copyright (C) 2005-2008, Øyvind Kolås and others. + * + * SSE optimized conversion routines. + * Copyright (C) 2008, Jan Heller. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General + * Public License along with this library; if not, see + * . + */ + +#include "config.h" + +#if defined(__GNUC__) && (__GNUC__ >= 4) && defined(USE_SSE) && defined(USE_MMX) + +#include "babl.h" +#include "babl-cpuaccel.h" + +#define INLINE inline + +typedef float g4float __attribute__ ((vector_size (4*sizeof(float)))); +typedef int g2int __attribute__ ((vector_size (2*sizeof(int)))); + +#define g4float(a,b,c,d) ((g4float){a,b,c,d}) +#define g4float_all(val) g4float(val,val,val,val) +#define g4float_zero g4float_all(0.0) +#define g4float_ff g4float_all(255.0) + +#define g4float_max(a,b) __builtin_ia32_maxps(a, b) +#define g4float_min(a,b) __builtin_ia32_minps(a, b) +#define g4float_cvt2pi(a) __builtin_ia32_cvtps2pi(a) +#define g4float_movhl(a,b) __builtin_ia32_movhlps(a, b) +#define g4float_emms __builtin_ia32_emms + + +static INLINE long +conv_rgbaF_linear_rgb8_linear (unsigned char *src, + unsigned char *dst, + long samples) +{ + long n = samples; + g4float *g4src = (g4float *) src; + g4float v; + + union { + g2int si; + unsigned char c[8]; + } u; + + while (n--) + { + v = *g4src++ * g4float_ff; + v = g4float_min(v, g4float_ff); + v = g4float_max(v, g4float_zero); + u.si = g4float_cvt2pi (v); + *dst++ = u.c[0]; + *dst++ = u.c[4]; + v = g4float_movhl (v, v); + u.si = g4float_cvt2pi (v); + *dst++ = u.c[0]; + } + + g4float_emms (); + + return samples; +} + + +static INLINE long +conv_rgbaF_linear_rgba8_linear (unsigned char *src, + unsigned char *dst, + long samples) +{ + long n = samples; + g4float *g4src = (g4float *) src; + g4float v; + + union { + g2int si; + unsigned char c[8]; + } u; + + while (n--) + { + v = *g4src++ * g4float_ff; + v = g4float_min(v, g4float_ff); + v = g4float_max(v, g4float_zero); + u.si = g4float_cvt2pi (v); + *dst++ = u.c[0]; + *dst++ = u.c[4]; + v = g4float_movhl (v, v); + u.si = g4float_cvt2pi (v); + *dst++ = u.c[0]; + *dst++ = u.c[4]; + } + + g4float_emms (); + + return 
samples; +} + +#endif + +#define o(src, dst) \ + babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL) + +int init (void); + +int +init (void) +{ + Babl *rgbaF_linear = babl_format_new ( + babl_model ("RGBA"), + babl_type ("float"), + babl_component ("R"), + babl_component ("G"), + babl_component ("B"), + babl_component ("A"), + NULL); + Babl *rgba8_linear = babl_format_new ( + babl_model ("RGBA"), + babl_type ("u8"), + babl_component ("R"), + babl_component ("G"), + babl_component ("B"), + babl_component ("A"), + NULL); + Babl *rgb8_linear = babl_format_new ( + babl_model ("RGB"), + babl_type ("u8"), + babl_component ("R"), + babl_component ("G"), + babl_component ("B"), + NULL); + +#if defined(__GNUC__) && (__GNUC__ >= 4) && defined(USE_SSE) && defined(USE_MMX) + + if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_MMX) && + (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE)) + { + o (rgbaF_linear, rgb8_linear); + o (rgbaF_linear, rgba8_linear); + } + +#endif + + return 0; +} + -- 2.30.2